{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导入依赖"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from encoder.encoder import Encoder\n",
    "from transcriptor.whisperx import WhisperX\n",
    "from langchain.vectorstores.pgvector import PGVector\n",
    "from langchain.docstore.document import Document\n",
    "from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip\n",
    "import utils\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv(\"env/connection.env\")\n",
    "\n",
    "COLLECTION_NAME = \"pt\"  # or 'en'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将视频转换为音频文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "MoviePy - Writing audio in data/pt.wav\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                        "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MoviePy - Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r"
     ]
    }
   ],
   "source": [
    "utils.convert_to_wav(f\"data/{COLLECTION_NAME}.mp4\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用 WhisperX 转录音频，并创建将存储在 Postgres 中的文档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "Failed to align segment (\" that it would have been a little more difficult to do so. But I was able to\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" didn't know what to do for a few months. I was just thinking. And then I was back in\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" my old ways. I was a little bit of a coward. I was thinking of my old ways. I was thinking,\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" you know, I'm not really doing that, but I'm just going to do it my way. And then I was\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" thinking, I'm going to do it my way. And then I'm going to do it my way. And then I'm going\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" often the case that I was fired from the company because I didn't have the money to pay for\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" the movie I was working on. I was a little bit of a loser, but I was a little bit of a loser.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was a little bit of a loser, but I was a little bit of a loser. I was a little bit of a loser.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" I was in a coma.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It was a great invention.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It was a great invention.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It was a great invention.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It was a great invention.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It was a great invention.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It was a great invention.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It was a great invention.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It is starting today.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" It is starting today.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" This is our year onemore.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" This is our year two.\"): backtrack failed, resorting to original...\n",
      "Failed to align segment (\" .\"): no characters in this segment found in model dictionary, resorting to original...\n"
     ]
    }
   ],
   "source": [
    "# WhisperX on CPU is too slow, so we used the large Whisper model instead\n",
    "whisperx = WhisperX(model_name=\"whisper\")\n",
    "transcription = whisperx.transcribe(f\"data/{COLLECTION_NAME}.wav\")\n",
    "\n",
    "# create documents to store in Postgres\n",
    "docs = [\n",
    "    Document(page_content=f'start {item[\"start\"]} - end {item[\"end\"]}: {item[\"text\"]}')\n",
    "    for item in transcription[\"segments\"]\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 建立连接设置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "CONNECTION_STRING = PGVector.connection_string_from_db_params(\n",
    "    driver=os.getenv(\"DRIVER\"),\n",
    "    host=os.getenv(\"HOST\"),\n",
    "    port=os.getenv(\"PORT\"),\n",
    "    database=os.getenv(\"DATABASE\"),\n",
    "    user=os.getenv(\"USERNAME\"),\n",
    "    password=os.getenv(\"PASSWORD\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将 embedding 后的内容插入 Postgres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "db = PGVector.from_documents(\n",
    "    embedding=Encoder().encoder,\n",
    "    documents=docs,\n",
    "    collection_name=COLLECTION_NAME,\n",
    "    connection_string=CONNECTION_STRING,\n",
    "    pre_delete_collection=True,  # deletes previous records, useful for testing\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 查询数据库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Huub use case with Portuguese audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query PT: 'marcas e investimentos' | Top 4  results:\n",
      "start 361.82 - end 362.865:  to run the company with me.\n",
      "start 366.16 - end 367.907:  visions of the future began to diverge.\n",
      "start 462.8 - end 463.583:  current renaissance.\n",
      "start 354.96 - end 355.823:  company you started?\n",
      "Query EN: 'brands and investments' | Top 4 results:\n",
      "start 354.96 - end 355.823:  company you started?\n",
      "start 385.825 - end 389.68: I felt that I had let the previous generation of entrepreneurs down, that I had\n",
      "start 338.18 - end 342.76:  years Apple had grown from just the two of us in a garage into a $2 billion company with\n",
      "start 436.067 - end 439.74: During the next five years, I started a company named Next, another company\n"
     ]
    }
   ],
   "source": [
    "similar_docs_pt = db.similarity_search(\"marcas e investimentos\", k=4)\n",
    "similar_docs_en = db.similarity_search(\"brands and investments\", k=4)\n",
    "\n",
    "print(\"Query PT: 'marcas e investimentos' | Top 4  results:\")\n",
    "print(\"\\n\".join([x.page_content for x in similar_docs_pt]))\n",
    "print(\"Query EN: 'brands and investments' | Top 4 results:\")\n",
    "print(\"\\n\".join([x.page_content for x in similar_docs_en]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 使用英语音频的深度学习使用案例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query PT: 'modelos de aprendizagem profunda' | Top 8 result:\n",
      "start 384.64 - end 384.861:  months.\n",
      "Query EN: 'deep learning models' | Top 1 result:\n",
      "start 195.14 - end 196.159:  Let me give you one example.\n"
     ]
    }
   ],
   "source": [
    "similar_docs_pt = db.similarity_search(\"modelos de aprendizagem profunda\", k=8)\n",
    "similar_docs_en = db.similarity_search(\"deep learning models\", k=8)\n",
    "\n",
    "print(\"Query PT: 'modelos de aprendizagem profunda' | Top 8 result:\")\n",
    "print(similar_docs_pt[-1].page_content)\n",
    "print(\"Query EN: 'deep learning models' | Top 1 result:\")\n",
    "print(similar_docs_en[0].page_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query PT: 'distribuição normal' | Top 4 results:\n",
      "start 175.54 - end 179.659:  I returned Coke bottles for the five cent deposits to buy food with, and\n",
      "start 462.8 - end 463.583:  current renaissance.\n",
      "start 6.248 - end 6.669:  Thank you.\n",
      "start 279.04 - end 282.273:  personal computers might not have the wonderful typography that they do.\n",
      "Query EN: 'normal distribution' | Top 4 results:\n",
      "start 342.8 - end 344.125:  over 4,000 employees.\n",
      "start 499.627 - end 502.839: And the only way to do great work is to love what you do.\n",
      "start 492.723 - end 497.64:  Your work is gonna fill a large part of your life, and the only way to be truly satisfied is to do\n",
      "start 351.049 - end 351.972: And then I got fired.\n"
     ]
    }
   ],
   "source": [
    "similar_docs_pt = db.similarity_search(\"distribuição normal\", k=4)\n",
    "similar_docs_en = db.similarity_search(\"normal distribution\", k=4)\n",
    "\n",
    "print(\"Query PT: 'distribuição normal' | Top 4 results:\")\n",
    "print(\"\\n\".join([x.page_content for x in similar_docs_pt]))\n",
    "print(\"Query EN: 'normal distribution' | Top 4 results:\")\n",
    "print(\"\\n\".join([x.page_content for x in similar_docs_en]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "zaai",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
